package com.maistars.spider.infrastructure.adapter;

import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
import com.maistars.spider.infrastructure.hanlp.HanlpDocService;

/**
 * Content-extraction adapter: a thin static facade over
 * {@link ContentExtractor} so callers do not depend on that class directly.
 *
 * @author dhr
 * @date 2020/11/28 9:02 PM
 */
public class WebCollectorAdapter {

    /** URL used by {@link #main(String[])} when no URL is passed on the command line. */
    private static final String DEFAULT_DEMO_URL = "https://mp.weixin.qq.com/s/9S6QQENzWqCJJZGQVNHRGg";

    /**
     * Extracts a {@link News} object for the given URL by delegating to
     * {@link ContentExtractor#getNewsByUrl(String)}.
     *
     * @param url address of the page to extract; passed to the extractor unchanged
     * @return the {@link News} produced by the underlying extractor
     * @throws Exception whatever the underlying extractor throws; it is propagated as-is
     */
    public static News getNewsByUrl(String url) throws Exception {
        return ContentExtractor.getNewsByUrl(url);
    }

    /**
     * Manual smoke test: extracts one page, prints its URL, title, time and
     * content, then prints a summary of the content.
     *
     * @param args optional; {@code args[0]} is the URL to extract. When absent,
     *             a built-in sample article URL is used.
     * @throws Exception if extraction or summarisation fails
     */
    public static void main(String[] args) throws Exception {
        // Allow the page to be chosen at launch time; fall back to the sample URL
        // so running with no arguments behaves exactly as before.
        String url = (args != null && args.length > 0) ? args[0] : DEFAULT_DEMO_URL;

        // Go through this class's own entry point rather than the extractor directly.
        News news = getNewsByUrl(url);
        System.out.println(news.getUrl());
        System.out.println(news.getTitle());
        System.out.println(news.getTime());
        System.out.println(news.getContent());

        // NOTE(review): 150 is presumably the maximum summary length — confirm
        // against HanlpDocService.extractSummary.
        System.out.println(HanlpDocService.extractSummary(news.getContent(), 150));
    }

}
